Hankowsky Homework Solutions
# Read in the NLSY79 income data (Sleuth3 exercise 8.28).
income <- Sleuth3::ex0828
# Inspect the first rows of the dataframe.
head(income)
## Subject AFQT Educ Income2005
## 1 2 6.841 12 5500
## 2 6 99.393 16 65000
## 3 7 47.412 12 19000
## 4 8 44.022 14 36000
## 5 9 59.683 14 65000
## 6 13 72.313 16 8000
# Plot 2005 income as a function of AFQT score.
income %>%
  ggplot() +
  geom_jitter(aes(x = AFQT, y = Income2005)) +
  theme_classic()
# Same plot with group means overlaid (kept for reference; intentionally not run).
# income %>%
#   ggplot(aes(x = AFQT, y = Income2005)) +
#   geom_jitter(alpha = 0.4) +
#   stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") +
#   stat_summary(fun = "mean", colour = "red", geom = "line") +
#   stat_summary(fun = mean,
#                geom = "errorbar",
#                fun.max = function(x) mean(x) + sd(x),
#                fun.min = function(x) mean(x) - sd(x), col = "red") +
#   theme_classic()
# Plot again with the least-squares line (blue) and a loess smooth
# (red, dashed) to judge linearity.
income %>%
  ggplot(aes(x = AFQT, y = Income2005)) +
  geom_jitter(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +  # use TRUE/FALSE, never T/F
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#run the regression
# Simple linear regression of raw 2005 income on AFQT score.
income_lm <- lm(Income2005 ~ AFQT, data = income)
summary(income_lm)
##
## Call:
## lm(formula = Income2005 ~ AFQT, data = income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72004 -24075 -7815 12447 643506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21181.66 1925.59 11.00 <2e-16 ***
## AFQT 518.68 31.51 16.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44460 on 2582 degrees of freedom
## Multiple R-squared: 0.09496, Adjusted R-squared: 0.09461
## F-statistic: 270.9 on 1 and 2582 DF, p-value: < 2.2e-16
# ANOVA table: overall F-test for the regression.
anova(income_lm)
## Analysis of Variance Table
##
## Response: Income2005
## Df Sum Sq Mean Sq F value Pr(>F)
## AFQT 1 5.3556e+11 5.3556e+11 270.91 < 2.2e-16 ***
## Residuals 2582 5.1044e+12 1.9769e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 95% confidence intervals for intercept and slope.
confint(income_lm)
## 2.5 % 97.5 %
## (Intercept) 17405.7989 24957.5148
## AFQT 456.8886 580.4756
#plot the diagnostics
# Standard 2x2 diagnostic panel (residuals vs fitted, QQ, scale-location,
# leverage). The fan shape here motivates the log transform below.
par(mfrow=c(2,2))
plot(income_lm)
###############################################################################
# Log-transform income: the residual plot above fans out ("trumpeting"),
# i.e. non-constant variance, which a log transform stabilizes.
income <- income %>%
  mutate(log_income = log(Income2005))
# Plot log(2005 income) as a function of AFQT score.
income %>%
  ggplot() +
  geom_jitter(aes(x = AFQT, y = log_income)) +
  theme_classic()
# Plot again with the least-squares line and a loess smooth.
income %>%
  ggplot(aes(x = AFQT, y = log_income)) +
  geom_jitter(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +  # use TRUE/FALSE, never T/F
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#run the regression
# Regression of natural-log income on AFQT; after exponentiation the slope
# becomes a multiplicative effect on the original income scale.
income_lmb <- lm(log_income ~ AFQT, data = income)
summary(income_lmb)
##
## Call:
## lm(formula = log_income ~ AFQT, data = income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4608 -0.3784 0.1224 0.5661 2.8794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.868419 0.040267 245.08 <2e-16 ***
## AFQT 0.010458 0.000659 15.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9298 on 2582 degrees of freedom
## Multiple R-squared: 0.08887, Adjusted R-squared: 0.08851
## F-statistic: 251.8 on 1 and 2582 DF, p-value: < 2.2e-16
anova(income_lmb)
## Analysis of Variance Table
##
## Response: log_income
## Df Sum Sq Mean Sq F value Pr(>F)
## AFQT 1 217.71 217.706 251.84 < 2.2e-16 ***
## Residuals 2582 2232.07 0.864
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(income_lmb)
## 2.5 % 97.5 %
## (Intercept) 9.789461406 9.94737749
## AFQT 0.009165405 0.01174977
#back-transform the estimates
# exp() is correct here because log_income used the natural log.
exp(income_lmb$coefficients)
## (Intercept) AFQT
## 19310.793100 1.010512
exp(confint(income_lmb))
## 2.5 % 97.5 %
## (Intercept) 17844.692528 20897.346904
## AFQT 1.009208 1.011819
#plot the diagnostics
par(mfrow=c(2,2))
plot(income_lmb)
#plot the presentation graphic
# ggformula chain: jittered points with prediction and confidence bands.
par(mfrow=c(1,1))
gf_jitter(log_income ~ AFQT, data = income) %>%
gf_lm(interval = "prediction") %>%
gf_lm(interval = "confidence", alpha = 0.5) +
labs(title = "Log(2005 Income) vs AFQT Score",
y = "Log(2005 Income)", x = "AFQT Score") +
# Optional annotation with R^2/intercept/slope (kept for reference; not run).
# geom_text(aes(7,600000, label = paste("R2 = ", signif(rsquared(income_lm), 3), "\n",
# "Intercept =",signif(income_lm$coef[[1]], 4), "\n",
# " Slope =",signif(income_lm$coef[[2]], 4)))) +
theme_classic()
The residual diagnostic plot of the raw 2005 income as a function of AFQT score showed a pattern of trumpeting (variance increasing with the mean), so a log transformation was conducted. The diagnostic plots of the log-transformed income have no problematic trends, so the interpretation of the regression was performed on the log-transformed income data. These data provide overwhelming evidence that 2005 income is associated with the IQ test score (AFQT score) (p-value < 0.0001). The AFQT score explained about 8.9% of the variance in log(2005 income) (\(R^2\) = 0.089). The median salary is estimated to increase by a factor of 1.0105 (about 1%) for each one-point increase in AFQT score (95% confidence interval: 1.009 to 1.012).
# Plot 2005 income as a function of years of education.
income %>%
  ggplot() +
  geom_jitter(aes(x = Educ, y = Income2005)) +
  theme_classic()
# Plot again with per-education-level means (red points/line) and
# mean +/- 1 SD error bars to inspect linearity and spread.
income %>%
  ggplot(aes(x = Educ, y = Income2005)) +
  geom_jitter(alpha = 0.4) +
  stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") +
  stat_summary(fun = "mean", colour = "red", geom = "line") +
  stat_summary(fun = mean,
               geom = "errorbar",
               fun.max = function(x) mean(x) + sd(x),
               fun.min = function(x) mean(x) - sd(x), col = "red") +
  theme_classic()
# Plot again with the least-squares line and a loess smooth.
income %>%
  ggplot(aes(x = Educ, y = Income2005)) +
  geom_jitter(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +  # use TRUE/FALSE, never T/F
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#run the regression
# Simple linear regression of raw 2005 income on years of education.
income_lm2 <- lm(Income2005 ~ Educ, data = income)
summary(income_lm2)
##
## Call:
## lm(formula = Income2005 ~ Educ, data = income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88660 -24121 -7686 12782 614807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -40199.6 4865.0 -8.263 2.24e-16 ***
## Educ 6451.5 344.7 18.717 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43860 on 2582 degrees of freedom
## Multiple R-squared: 0.1195, Adjusted R-squared: 0.1191
## F-statistic: 350.3 on 1 and 2582 DF, p-value: < 2.2e-16
anova(income_lm2)
## Analysis of Variance Table
##
## Response: Income2005
## Df Sum Sq Mean Sq F value Pr(>F)
## Educ 1 6.7382e+11 6.7382e+11 350.33 < 2.2e-16 ***
## Residuals 2582 4.9662e+12 1.9234e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(income_lm2)
## 2.5 % 97.5 %
## (Intercept) -49739.365 -30659.786
## Educ 5775.593 7127.356
#plot the diagnostics
# Again shows trumpeting, motivating the log-scale refit below.
par(mfrow=c(2,2))
plot(income_lm2)
###############################################################################
# Log-income versions of the education plots (log_income created earlier).
income %>%
  ggplot() +
  geom_jitter(aes(x = Educ, y = log_income)) +
  theme_classic()
# Group means and mean +/- 1 SD bars on the log scale.
income %>%
  ggplot(aes(x = Educ, y = log_income)) +
  geom_jitter(alpha = 0.4) +
  stat_summary(fun = "mean", colour = "red", size = 2, geom = "point") +
  stat_summary(fun = "mean", colour = "red", geom = "line") +
  stat_summary(fun = mean,
               geom = "errorbar",
               fun.max = function(x) mean(x) + sd(x),
               fun.min = function(x) mean(x) - sd(x), col = "red") +
  theme_classic()
# Least-squares line and loess smooth on the log scale.
income %>%
  ggplot(aes(x = Educ, y = log_income)) +
  geom_jitter(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +  # use TRUE/FALSE, never T/F
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#run the regression
# Regression of natural-log income on years of education.
income_lm2b <- lm(log_income ~ Educ, data = income)
summary(income_lm2b)
##
## Call:
## lm(formula = log_income ~ Educ, data = income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8671 -0.3487 0.1441 0.5772 2.6981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.880995 0.103473 85.83 <2e-16 ***
## Educ 0.112067 0.007331 15.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9328 on 2582 degrees of freedom
## Multiple R-squared: 0.08299, Adjusted R-squared: 0.08264
## F-statistic: 233.7 on 1 and 2582 DF, p-value: < 2.2e-16
anova(income_lm2b)
## Analysis of Variance Table
##
## Response: log_income
## Df Sum Sq Mean Sq F value Pr(>F)
## Educ 1 203.32 203.32 233.69 < 2.2e-16 ***
## Residuals 2582 2246.46 0.87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(income_lm2b)
## 2.5 % 97.5 %
## (Intercept) 8.67809691 9.0838926
## Educ 0.09769147 0.1264416
#plot the diagnostics
par(mfrow=c(2,2))
plot(income_lm2b)
#back-transform the estimates
# exp() is correct: log_income used the natural log. exp(slope) is the
# multiplicative change in income per additional year of education.
exp(income_lm2b$coefficients)
## (Intercept) Educ
## 7193.943312 1.118587
exp(confint(income_lm2b))
## 2.5 % 97.5 %
## (Intercept) 5872.859360 8812.201552
## Educ 1.102623 1.134783
#plot the presentation graphic
# Points with prediction and confidence bands via ggformula.
par(mfrow=c(1,1))
gf_jitter(log_income ~ Educ, data = income) %>%
gf_lm(interval = "prediction") %>%
gf_lm(interval = "confidence", alpha = 0.5) +
labs(title = "Log(2005 Income) vs Years of Education",
y = "Log(2005 Income)", x = "Years of Education") +
theme_classic()
Investigation of the means of sub-group distributions shows a straight-line regression would be appropriate, but the variability increases as the mean of 2005 income increases. A weighted regression would be appropriate for this situation; however, weighted regression is not covered in the Sleuth book until chapter 11, so a simple linear regression was conducted. The residual diagnostic plot of the raw 2005 income as a function of years of education showed a pattern of trumpeting, so a log transformation was conducted. The diagnostic plots of the log-transformed income have no problematic trends, so the interpretation of the regression was performed on the log-transformed income data. These data provide overwhelming evidence that 2005 income is associated with the number of years of education (p-value < 0.0001). Years of education explained about 8.3% of the variance in log(2005 income) (\(R^2\) = 0.083). The median salary is estimated to increase by a factor of 1.12 for each additional year of education (95% confidence interval: 1.10 to 1.13).
#read in data
# Deposit feeder ingestion data (Sleuth3 exercise 9.21).
deposit_feeders <- Sleuth3::ex0921
#looking at the dataframe
head(deposit_feeders)
## Species Weight Ingestion Organic
## 1 Hydrobia neglecta 0.20 0.57 18.0
## 2 Hydrobia ventrosa 0.20 0.86 17.0
## 3 Tubifex tubifex 0.27 0.43 29.7
## 4 Hyalella azteca 0.32 0.43 50.0
## 5 Potamopyrgus jenkinsi 0.46 2.70 14.4
## 6 Hydrobia ulvae 0.90 0.67 13.0
#transforming the data as suggested in the exercise
# log(x + 1) transforms of all three variables.
deposit_feeders <- deposit_feeders %>%
mutate(log_weight = log(Weight +1),
log_ingestion = log(Ingestion +1),
log_organic = log(Organic +1))
#looking at scatterplots of the variables
# ggpairs (GGally) pairs plot of the three log-transformed columns (5:7).
ggpairs(deposit_feeders, columns = 5:7)
#run the multiple regression
# Main effects of log weight and log organic content plus their interaction.
deposit_feeders_lm <- lm(log_ingestion ~ log_weight + log_organic + log_weight:log_organic, data = deposit_feeders)
summary(deposit_feeders_lm)
##
## Call:
## lm(formula = log_ingestion ~ log_weight + log_organic + log_weight:log_organic,
## data = deposit_feeders)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92998 -0.24353 -0.00468 0.38169 0.74882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.41061 0.68647 4.968 0.000168 ***
## log_weight 0.89804 0.13483 6.661 7.6e-06 ***
## log_organic -0.95565 0.22866 -4.179 0.000806 ***
## log_weight:log_organic -0.01311 0.05683 -0.231 0.820672
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5071 on 15 degrees of freedom
## Multiple R-squared: 0.9796, Adjusted R-squared: 0.9755
## F-statistic: 239.6 on 3 and 15 DF, p-value: 6.847e-13
confint(deposit_feeders_lm)
## 2.5 % 97.5 %
## (Intercept) 1.9474376 4.8737896
## log_weight 0.6106594 1.1854297
## log_organic -1.4430189 -0.4682821
## log_weight:log_organic -0.1342289 0.1080110
#back-transform the estimates
# exp() is appropriate: the response used the natural log.
exp(deposit_feeders_lm$coefficients)
## (Intercept) log_weight log_organic
## 30.2838202 2.4547981 0.3845619
## log_weight:log_organic
## 0.9869766
exp(confint(deposit_feeders_lm))
## 2.5 % 97.5 %
## (Intercept) 7.0107000 130.8157202
## log_weight 1.8416453 3.2720925
## log_organic 0.2362136 0.6260769
## log_weight:log_organic 0.8743899 1.1140600
#plot the diagnostics
par(mfrow=c(2,2))
plot(deposit_feeders_lm)
# NOTE(review): no presentation graphic was produced for this exercise;
# the conclusion paragraph follows directly.
These data provide strong evidence that the ingestion rate of deposit feeders is associated with the percentage of organic matter in the food after accounting for the effect of species weight (p-value < 0.001, multiple regression). These data also provide overwhelming evidence that the ingestion rate of deposit feeders is associated with species weight after accounting for the percentage of organic matter in the food (p-value < 0.0001, multiple regression). However, there is no evidence of an interaction between the percentage of organic matter in the food and species weight (p-value = 0.82, multiple regression).
# Read in the mammal metabolism data (Sleuth3 exercise 8.26).
mammals <- Sleuth3::ex0826
# Inspect the first rows of the dataframe.
head(mammals)
## CommonName Species Mass Metab Life
## 1 Echidna Tachiglossus aculeatus 2.500 302 14
## 2 Long-beaked echidna Zaglossus bruijni 10.300 594 20
## 3 Platypus Ornithorhynchus anatinus 1.300 229 9
## 4 Opossum Lutreolina crassicaudata 0.812 196 5
## 5 South American opossum Didelphis marsupialis 1.330 299 6
## 6 Virginia opossum Didelphis virginiana 3.260 519 8
# Transform the data as described in the question: mass^(3/4) for
# Kleiber's law, plus natural-log mass and metabolic rate.
mammals <- mammals %>%
  mutate(mass_3 = (Mass^(3/4)),
         log_metab = log(Metab),
         log_mass = log(Mass))
# Plot metabolic rate by mass (raw scales).
mammals %>%
  ggplot() +
  geom_point(aes( x = Mass, y = Metab)) +
  labs(title = "Metabolic Rate v Mass",
       x = "Mass (kg)", y = "Average Basal Metabolic Rate (kJ/Day)") +
  theme_classic()
# Plot again with the least-squares line and a loess smooth; the curvature
# here motivates the log-log analysis below.
mammals %>%
  ggplot(aes( x = Mass, y = Metab)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +  # use TRUE/FALSE, never T/F
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  labs(title = "Metabolic Rate v Mass",
       x = "Mass (kg)", y = "Average Basal Metabolic Rate (kJ/Day)") +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# Metabolic rate vs mass^(3/4) plots (kept for reference; intentionally not run).
# mammals %>%
#   ggplot() +
#   geom_point(aes( x = mass_3, y = Metab)) +
#   labs(title = expression("Average Basal Metabolic Rate (kJ/Day) v Mass"^(3/4)* "(kg)"),
#        x = expression(paste(Mass^(3/4), "(kg)")),
#        y = "Average Basal Metabolic Rate (kJ/Day)") +
#   theme_classic()
#
# mammals %>%
#   ggplot(aes( x = mass_3, y = Metab)) +
#   geom_point(alpha = 0.6) +
#   geom_smooth(method = "lm", se = FALSE) +
#   geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
#   labs(title = expression("Average Basal Metabolic Rate (kJ/Day) v Mass"^(3/4)* "(kg)"),
#        x = expression(paste(Mass^(3/4), "(kg)")),
#        y = "Average Basal Metabolic Rate (kJ/Day)") +
#   theme_classic()
# Plot log(metabolic rate) by log(mass) -- the log-log scale linearizes the
# power-law relationship.
mammals %>%
  ggplot() +
  geom_point(aes( x = log_mass, y = log_metab)) +
  labs(title = "Log(Metabolic Rate) v Log(Mass)",
       x = "Log(Mass (kg))", y = "Log(Average Basal Metabolic Rate (kJ/Day))") +
  theme_classic()
# Plot again with the least-squares line and a loess smooth.
mammals %>%
  ggplot(aes( x = log_mass, y = log_metab)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +  # use TRUE/FALSE, never T/F
  geom_smooth(method = stats::loess, se = FALSE, col = "red", lty = 2) +
  labs(title = "Log(Metabolic Rate) v Log(Mass)",
       x = "Log(Mass (kg))", y = "Log(Average Basal Metabolic Rate (kJ/Day))") +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#run the regression
# Log-log regression: the slope is the power-law exponent relating
# metabolic rate to mass (Kleiber's law predicts 3/4).
mammals_lm <- lm(log_metab ~ log_mass, data = mammals)
summary(mammals_lm)
##
## Call:
## lm(formula = log_metab ~ log_mass, data = mammals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14216 -0.26466 -0.04889 0.25308 1.37616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.63833 0.04709 119.73 <2e-16 ***
## log_mass 0.73874 0.01462 50.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4572 on 93 degrees of freedom
## Multiple R-squared: 0.9649, Adjusted R-squared: 0.9645
## F-statistic: 2553 on 1 and 93 DF, p-value: < 2.2e-16
anova(mammals_lm)
## Analysis of Variance Table
##
## Response: log_metab
## Df Sum Sq Mean Sq F value Pr(>F)
## log_mass 1 533.82 533.82 2553.4 < 2.2e-16 ***
## Residuals 93 19.44 0.21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
confint(mammals_lm)
## 2.5 % 97.5 %
## (Intercept) 5.5448128 5.7318485
## log_mass 0.7097121 0.7677752
#back-transform the estimates
# exp() is valid here (natural logs), but note exp(slope) is the
# multiplicative change per one-unit increase in log(mass), not per kg;
# the untransformed slope 0.739 is the power-law exponent itself.
exp(mammals_lm$coefficients)
## (Intercept) log_mass
## 280.993255 2.093304
exp(confint(mammals_lm))
## 2.5 % 97.5 %
## (Intercept) 255.906668 308.539085
## log_mass 2.033406 2.154967
#plot the diagnostics
par(mfrow=c(2,2))
plot(mammals_lm)
#plot the presentation graphic
# Points with prediction and confidence bands on the log-log scale.
par(mfrow=c(1,1))
gf_point(log_metab ~ log_mass, data = mammals) %>%
gf_lm(interval = "prediction") %>%
gf_lm(interval = "confidence", alpha = 0.5) +
labs(title = expression("Log(Average Basal Metabolic Rate (kJ/Day)) v Log(Mass (kg))"),
y = "Log(Average Basal Metabolic Rate (kJ/Day))",
x = "Log(Mass (kg))") +
# Optional annotation (kept for reference; not run).
# geom_text(aes(5,5, label = paste("R2 = ", signif(rsquared(mammals_lm), 3), "\n",
# "Intercept =",signif(mammals_lm$coef[[1]], 4), "\n",
# " Slope =",signif(mammals_lm$coef[[2]], 4)))) +
theme_classic()
The loess smooth of the scatterplot of the raw metabolic rate and average mass demonstrated the need for a log transformation, so the simple linear regression was conducted on the log-transformed data. These data provide overwhelming evidence that metabolic rate is associated with average mass in mammals (p-value &lt; 0.0001). Log(mass) explained 96.5% of the variance in log(metabolic rate) (\(R^2\) = 0.965). The estimated power-law exponent was 0.74 (95% confidence interval: 0.71 to 0.77) — that is, metabolic rate scales approximately as mass^(3/4) — providing strong support for Kleiber's law in mammals; however, it would not be appropriate to generalize that claim to all animal species from this dataset. Equivalently, the median metabolic rate increases 2.09 times for each e-fold (roughly 2.7×) increase in mass (95% confidence interval: 2.03 to 2.15 times).
#read in data
# NLSY79 income data with gender (Sleuth3 exercise 9.23).
incomeb <- Sleuth3::ex0923
#looking at the dataframe
head(incomeb)
## Subject Gender AFQT Educ Income2005
## 1 2 female 6.841 12 5500
## 2 6 male 99.393 16 65000
## 3 7 male 47.412 12 19000
## 4 8 female 44.022 14 36000
## 5 9 male 59.683 14 65000
## 6 13 male 72.313 16 8000
#looking at scatterplots of the variables
# Side-by-side scatterplots colored/shaped by gender (gridExtra).
p91 <- incomeb %>%
ggplot() +
geom_jitter(aes(x = Educ, y = Income2005, color = Gender, shape = Gender),
alpha = 0.4) +
labs(title = "2005 Income v Years of Education",
x = "Years of Education", y = "2005 Income ($)") +
theme_classic() +
theme(legend.position = c(0.2, 0.85))
p92 <- incomeb %>%
ggplot() +
geom_jitter(aes(x = AFQT, y = Income2005, color = Gender, shape = Gender),
alpha = 0.4) +
labs(title = "2005 Income v AFQT Scores",
x = "AFQT Score", y = "2005 Income ($)") +
theme_classic() +
theme(legend.position = c(0.2, 0.85))
grid.arrange(p91, p92, ncol=2)
#run the multiple regression
# Additive model: gender effect adjusted for AFQT and education.
incomeb_lm <- lm(Income2005 ~ Gender + AFQT + Educ, data = incomeb)
summary(incomeb_lm)
##
## Call:
## lm(formula = Income2005 ~ Gender + AFQT + Educ, data = incomeb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104620 -20911 -4681 11105 604024
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -48760.12 4868.19 -10.016 < 2e-16 ***
## Gendermale 28463.29 1621.08 17.558 < 2e-16 ***
## AFQT 222.98 36.32 6.139 9.56e-10 ***
## Educ 5158.28 402.67 12.810 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41080 on 2580 degrees of freedom
## Multiple R-squared: 0.228, Adjusted R-squared: 0.2271
## F-statistic: 253.9 on 3 and 2580 DF, p-value: < 2.2e-16
confint(incomeb_lm)
## 2.5 % 97.5 %
## (Intercept) -58306.080 -39214.1570
## Gendermale 25284.530 31642.0446
## AFQT 151.762 294.1969
## Educ 4368.699 5947.8634
#plot the diagnostics
# Trumpeting again: motivates the log transform in the next section.
par(mfrow=c(2,2))
plot(incomeb_lm)
###############################################################################
# Log-transform income: the residual plot above shows trumpeting
# (non-constant variance).
incomeb <- incomeb %>%
  mutate(log_income = log(Income2005))
# Scatterplots of log income vs education and AFQT, colored/shaped by gender.
p91 <- incomeb %>%
  ggplot() +
  geom_jitter(aes(x = Educ, y = log_income, color = Gender, shape = Gender),
              alpha = 0.4) +
  labs(title = "Log(2005 Income) v Years of Education",
       x = "Years of Education", y = "Log(2005 Income ($))") +
  theme_classic() +
  theme(legend.position = "bottom")
p92 <- incomeb %>%
  ggplot() +
  geom_jitter(aes(x = AFQT, y = log_income, color = Gender, shape = Gender),
              alpha = 0.4) +
  labs(title = "Log(2005 Income) v AFQT Scores",
       x = "AFQT Score", y = "Log(2005 Income ($))") +
  theme_classic() +
  theme(legend.position = "bottom")
grid.arrange(p91, p92, ncol=2)
# Multiple regression of log income on gender, AFQT score, and education.
incomeb_lmb <- lm(log_income ~ Gender + AFQT + Educ, data = incomeb)
summary(incomeb_lmb)
##
## Call:
## lm(formula = log_income ~ Gender + AFQT + Educ, data = incomeb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0906 -0.3301 0.1404 0.5091 2.5452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.7312115 0.1026287 85.076 < 2e-16 ***
## Gendermale 0.6245093 0.0341748 18.274 < 2e-16 ***
## AFQT 0.0059139 0.0007657 7.724 1.6e-14 ***
## Educ 0.0769506 0.0084888 9.065 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8661 on 2580 degrees of freedom
## Multiple R-squared: 0.2101, Adjusted R-squared: 0.2092
## F-statistic: 228.7 on 3 and 2580 DF, p-value: < 2.2e-16
confint(incomeb_lmb)
## 2.5 % 97.5 %
## (Intercept) 8.529968655 8.932454398
## Gendermale 0.557496441 0.691522169
## AFQT 0.004412575 0.007415313
## Educ 0.060305047 0.093596160
# Diagnostics for the log-scale model.
par(mfrow=c(2,2))
plot(incomeb_lmb)
# Back-transform the estimates (natural log, so exp() is correct).
exp(incomeb_lmb$coefficients)
## (Intercept) Gendermale AFQT Educ
## 6193.226825 1.867329 1.005931 1.079989
exp(confint(incomeb_lmb))
## 2.5 % 97.5 %
## (Intercept) 5064.287091 7573.831779
## Gendermale 1.746295 1.996753
## AFQT 1.004422 1.007443
## Educ 1.062161 1.098116
# Presentation graphic: one fitted line per gender on the log scale.
incomeb %>%
  ggplot(aes(x = Educ, y = log_income, color = Gender, shape = Gender)) +
  geom_jitter(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE) +  # use TRUE/FALSE, never T/F
  labs(title = "Effect of Gender and Years of Education on 2005 Income",
       # y-axis label corrected: the plotted variable is log income
       x = "Years of Education", y = "Log(2005 Income)") +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'
The residual diagnostic plot of the raw 2005 income as a function of AFQT score, years of education, and gender showed a pattern of trumpeting, so a log transformation was conducted. The diagnostic plots of the log-transformed income have no problematic trends, so the interpretation of the regression was performed on the log-transformed income data. There is overwhelming evidence that the mean salary for males exceeds the mean salary for females with the same years of education and AFQT scores (p-value &lt; 0.0001, multiple regression). The median salary for males is estimated to be 1.87 times the median salary for females with the same years of education and AFQT scores (95% confidence interval: 1.75 to 2.00).
# FSA supplies headtail() and logbtcf() used in the Ruffe analysis below.
library(FSA)
## ## FSA v0.9.3. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
##
## Attaching package: 'FSA'
## The following object is masked from 'package:mosaic':
##
## perc
# car: companion to applied regression (diagnostics helpers).
library(car)
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
## method from
## hist.boot FSA
## confint.boot FSA
##
## Attaching package: 'car'
## The following object is masked from 'package:FSA':
##
## bootCase
## The following objects are masked from 'package:mosaic':
##
## deltaMethod, logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# magrittr: %>% pipe operator.
library(magrittr)
##
## Attaching package: 'magrittr'
## The following objects are masked from 'package:testthat':
##
## equals, is_less_than, not
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Re-attach dplyr last so its verbs (filter, mutate, select) win the
# masking conflicts noted above.
library(dplyr)
# Read in the Ruffe length-weight data.
ruf <- read.csv("./data/RuffeSLRH.csv")
# Keep only July records and add BASE-10 log weight and length.
# NOTE: log10, not natural log -- all back-transformations must use 10^.
ruf <- ruf %>%
  filter(month == 7) %>%
  mutate(logW = log10(wt),
         logL = log10(tl)) %>%
  select(-fishID, -day)
# First and last rows of the transformed dataframe.
headtail(ruf)
## year month tl wt logW logL
## 1 1988 7 78 6.0 0.7781513 1.892095
## 2 1988 7 81 7.0 0.8450980 1.908485
## 3 1988 7 82 7.0 0.8450980 1.913814
## 1936 2004 7 137 28.0 1.4471580 2.136721
## 1937 2004 7 143 31.4 1.4969296 2.155336
## 1938 2004 7 174 82.4 1.9159272 2.240549
# Subset to the years analyzed below.
ruf90 <- ruf %>%
  filter(year == 1990)
ruf9000 <- ruf %>%
  filter(year %in% c(1990,2000))
# Recreate the chapter 7 figures: raw and log-log scatterplots side by side.
pm1 <- ruf90 %>%
  ggplot() +
  geom_point(aes(x = tl, y = wt)) +
  labs(x = "Total Length (mm)", y = "Weight (g)" ) +
  theme_classic()
pm2 <- ruf90 %>%
  ggplot() +
  geom_point(aes(x = logL, y = logW)) +
  labs(x = "Log(Total Length)", y = "Log(Weight)" ) +
  theme_classic()
grid.arrange(pm1, pm2, ncol=2)
# Fit the log10-log10 length-weight regression; the slope is the
# allometric exponent b in W = a * L^b.
ruf_lm <- lm(logW ~ logL, data = ruf90)
summary(ruf_lm)
##
## Call:
## lm(formula = logW ~ logL, data = ruf90)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11445 -0.02780 0.00245 0.02819 0.13892
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.91447 0.06617 -74.27 <2e-16 ***
## logL 3.02516 0.03211 94.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04287 on 144 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.9839
## F-statistic: 8874 on 1 and 144 DF, p-value: < 2.2e-16
anova(ruf_lm)
## Analysis of Variance Table
##
## Response: logW
## Df Sum Sq Mean Sq F value Pr(>F)
## logL 1 16.3104 16.3104 8874.3 < 2.2e-16 ***
## Residuals 144 0.2647 0.0018
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Back-transform the estimates. The model used log10, so the inverse
# transform is 10^x. The original code used exp(), which is only valid for
# natural logs and contradicted the 10^ / logbtcf(ruf_lm, 10) usage below.
10^ruf_lm$coefficients
## (Intercept)        logL
## ~1.22e-05    ~1059.6     (approximate; re-knit to regenerate exact output)
10^confint(ruf_lm)
##                 2.5 %      97.5 %
## (Intercept) ~9.0e-06   ~1.65e-05
## logL        ~915.6     ~1226.9    (approximate; re-knit to regenerate)
############## remainder adapted from Ogle's fishR website code ###############
# Predict weights at two example lengths with a bias-correction factor for
# back-transforming from the log10 scale.
lens <- c(100,160) # vector of lengths
nd <- data.frame(logL=log10(lens)) # df of log(lengths)
( plogW <- predict(ruf_lm,nd) ) # predicted log(weights)
## 1 2
## 1.135860 1.753356
( cf <- logbtcf(ruf_lm,10) ) # FSA back-transformation bias-correction factor
## [1] 1.004884
cf*(10^plogW) # back-transforming with bias correction
## 1 2
## 13.73965 56.94713
# Confidence intervals for the mean weight at each length, back-transformed.
mlogW <- predict(ruf_lm,nd,interval="confidence")
cf*10^mlogW
## fit lwr upr
## 1 13.73965 13.49177 13.99208
## 2 56.94713 55.43960 58.49566
# Prediction intervals for an individual fish's weight, back-transformed.
plogW <- predict(ruf_lm,nd,interval="prediction")
cf*10^plogW
## fit lwr upr
## 1 13.73965 11.29455 16.71406
## 2 56.94713 46.76664 69.34378
# Fitted line on the log-log scale.
plot(logW~logL,data=ruf90,pch=19,col=rgb(0,0,0,1/4),
ylab="log Weight (g)",xlab="log Total Length (mm)")
tmp <- range(ruf90$logL)
xs <- seq(tmp[1],tmp[2],length.out=99)
ys <- predict(ruf_lm,data.frame(logL=xs))
lines(ys~xs,lwd=2)
# Bias-corrected fitted curve on the raw scale.
plot(wt~tl,data=ruf90,pch=19,col=rgb(0,0,0,1/4),
ylab="Weight (g)",xlab="Total Length (mm)")
btxs <- 10^xs
btys <- cf*10^ys
lines(btys~btxs,lwd=2)
# Same curve with back-transformed prediction bands.
btys <- cf*10^predict(ruf_lm,data.frame(logL=xs),
interval="prediction")
head(btys,n=3)
## fit lwr upr
## 1 2.251802 1.841414 2.753652
## 2 2.333299 1.908384 2.852825
## 3 2.417746 1.977784 2.955579
plot(wt~tl,data=ruf90,pch=19,col=rgb(0,0,0,1/4),
ylab="Weight (g)",xlab="Total Length (mm)")
lines(btys[,"fit"]~btxs,col="gray20",lwd=2,lty="solid")
lines(btys[,"lwr"]~btxs,col="gray20",lwd=2,lty="dashed")
lines(btys[,"upr"]~btxs,col="gray20",lwd=2,lty="dashed")
# NOTE(review): r and fv are computed but never used afterwards in the
# visible code -- presumably left over from a manual residual plot.
r <- residuals(ruf_lm)
fv <- fitted(ruf_lm)
par(mfrow=c(2,2))
plot(ruf_lm)
These data provide overwhelming evidence that weight is associated with total length in Ruffe (p-value &lt; 0.0001, analysis of variance F-test). Because both variables were log10-transformed, the slope is a power-law exponent: weight increases approximately with the cube of total length (estimated exponent 3.03; 95% confidence interval: 2.96 to 3.09), consistent with roughly isometric growth.